# -*- coding: utf-8 -*-
import re
import pandas as pd

# Extract comment text from a raw dump and export it.
#
# Pipeline:
#   1. Read the whole of test_2.txt.
#   2. Collect every span that runs from a '>' to the next '<'.
#   3. Drop spans that contain any of the markers in SKIP_MARKERS.
#   4. Strip '>', '<' and ':' from what remains, then print each comment,
#      write it as one line of hotcomment_biye.txt and record it (with its
#      1-based position) for the Excel export to comment.xlsx.

# The context manager guarantees the handle is closed even if read() raises.
with open("test_2.txt", "r", encoding="UTF-8") as source_file:
    content = source_file.read()

# Non-greedy match from '>' up to the next '<'; re.S lets '.' match newlines,
# so a span may cross line boundaries.
rawResults = re.findall(r">.*?<", content, re.S)

# A span is discarded when it contains any of these substrings.
SKIP_MARKERS = (
    ">']['<",
    ">:<",
    ">回复<",  # "回复" is Chinese for "reply"
    "><",
    ">', '<",
    "@",
    "> <",
)
firstStepResults = [
    result
    for result in rawResults
    if not any(marker in result for marker in SKIP_MARKERS)
]

# Translation table that deletes every '>', '<' and ':' in a single pass;
# equivalent to removing each of the three characters one after another.
STRIP_CHARS = str.maketrans("", "", "><:")

# Rows of [rank, comment]; rank is the 1-based position in firstStepResults.
mycomment = []
with open("hotcomment_biye.txt", "w", encoding="UTF-8") as comment_file:
    for i, lastResult in enumerate(firstStepResults, start=1):
        comment = lastResult.translate(STRIP_CHARS)
        print(i, comment)
        comment_file.write(comment + '\n')
        mycomment.append([i, comment])

# Column headers: '排序' = rank, '评论' = comment.
df = pd.DataFrame(mycomment, columns=['排序', '评论'])
df.to_excel('comment.xlsx', sheet_name='ranklist', index=False)